Project is about a recent incident that happened around Game Stop Short Squeeze. Idea is to extract data from wikipedia (using the rvest package), twitter (using the rtweet package), reddit’s wallstreetbets group data (using RedditExtractoR package), wrangle datasets and make it ready for the next phase of analysis. Eventually these data will be utilized to perform sentiment analysis, quantitative analysis and topic modelling using various techniques available in the field of natural language processing.
As per the review paper by Maurizio Naldi (2019) on sentiment analysis, there are four packages with which sentiment analysis can be performed using the R language.
First part of this project is to perform sentiment analysis on Twitter and Reddit’s data extracted in the previous phase and leverage above packages to perform this analysis. Each package covers various aspects of sentiment analysis. Significant features of each package will be explored to provide meaningful insight of sentiment analysis on game stop data. Second part of this project is to perform Quantitative analysis using the quanteda package. Third part of this project is to perform Topic modelling using Latent Dirichlet allocation (LDA), Bag of Words, TF-IDF, Word2Vec etc. methodologies.
References:-
#install.packages("rvest")
#install.packages("tidyverse")
#install.packages("stringr")
#install.packages("sjmisc")
#install.packages("lubridate")
#install.packages("RedditExtractoR")
#install.packages("tm")
#install.packages("syuzhet")
#install.packages("pander")
#install.packages("rlist")
#install.packages("sentimentr")
#install.packages("magrittr")
#install.packages("stringi")
#install.packages("pacman")
#install.packages("textcorpus")
#install.packages("textshape")
#install.packages("textreadr")
#install.packages("textclean")
#install.packages("numform")
#install.packages("xml2")
#install.packages("tidytext")
#install.packages("purrr")
#install.packages("rtweet")
#install.packages("RColorBrewer")
#install.packages("wordcloud")
#install.packages("data.table")
# Attach every package used in this analysis. Load order matters for masked
# generics (e.g. dplyr's filter vs stats::filter); tidyverse is loaded early
# so its verbs win.
# NOTE(review): "textcorpus" and "textreadr" may no longer be installable from
# CRAN -- confirm availability before re-running.
library(rvest)
library(tidyverse)
library(stringr)
library(sjmisc)
library(lubridate)
library(RedditExtractoR)
library(tm)
library(syuzhet)
library(pander)
library(rlist)
library(sentimentr)
library(magrittr)
library(stringi)
library(pacman)
library(textcorpus)
library(textshape)
library(textreadr)
library(textclean)
library(numform)
library(xml2)
library(data.table)
library(tidytext)
library(purrr)
library(rtweet)
library(RColorBrewer)
library(wordcloud)
# NOTE(review): setwd() with an absolute, user-specific path makes the script
# non-portable; prefer running from the project root (or here::here()).
setwd("~/Documents/MSDS - Rutgers/Spring-2021/16-954-597-01-DATA-WRANGLING/final-project/msds-data-wragling-project")
# Scrape the GameStop short squeeze Wikipedia article.
url <- "https://en.wikipedia.org/wiki/GameStop_short_squeeze"
html <- read_html(url)
# read all html tables in a page
html_table_data <- html %>% html_nodes("table") %>% html_table(fill = TRUE)
# stock prices
# NOTE(review): index 3 is position-dependent -- if Wikipedia reorders the
# page's tables this silently grabs the wrong one; verify on re-run.
other_stock_data <- html_table_data[[3]]
# Fetch historical price history for every security listed in the Wikipedia
# "impacted stocks" table.
#
# @param impacted_stocks_df Data frame with a `Security (symbol)` column whose
#   values embed the ticker in parentheses, e.g. "AMC Entertainment ... (AMC)".
# @return One data frame of daily price rows for all resolvable tickers, with
#   a `Stock Code` column identifying each security.
getImpactedStocksData <- function(impacted_stocks_df) {
  stock_names <- pull(impacted_stocks_df, `Security (symbol)`)
  # Extract the ticker from the trailing "(XXX)". Rows whose symbol is not a
  # plain 2-4 capital-letter code (e.g. "OTC Pink: BLIAQ") yield NA.
  stock_codes <- stock_names %>%
    str_extract("\\(+([A-Z]{2,4})\\)+") %>%
    str_extract("[A-Z]{2,4}")
  # ROBUSTNESS: skip unresolvable tickers instead of requesting ".../NA/history/".
  stock_codes <- stock_codes[!is.na(stock_codes)]
  # One fetch per ticker, combined once at the end -- avoids the O(n^2) copy
  # cost of growing a data frame with rbind() inside a loop.
  price_frames <- lapply(stock_codes, getHistoricalStockPriceByStockCode)
  if (length(price_frames) == 0) {
    return(data.frame())
  }
  do.call(rbind, price_frames)
}
# Scrape the daily price-history table for one ticker from Yahoo Finance.
#
# @param stock_code Ticker symbol, e.g. "GME".
# @return Data frame with Date (Date class), Open/High/Low/`Close*`/
#   `Adj Close**` and Volume as numerics, plus a `Stock Code` column.
getHistoricalStockPriceByStockCode <- function(stock_code) {
  url <- paste0("https://finance.yahoo.com/quote/", stock_code, "/history/")
  # BUG FIX: the original piped `url` into read_html() AND passed it again as
  # an argument (`url %>% read_html(url)` == read_html(url, url)).
  stock_price_df_list <- url %>%
    read_html() %>%
    html_nodes("table") %>%
    html_table(fill = TRUE)
  stock_price_df <- stock_price_df_list[[1]]
  # Drop the trailing footnote row Yahoo appends under the table.
  stock_price_df <- head(stock_price_df, -1)
  stock_price_df$Date <- mdy(stock_price_df$Date)
  # Non-numeric cells (e.g. dividend rows) coerce to NA with a warning.
  stock_price_df$Open <- as.numeric(stock_price_df$Open)
  stock_price_df$High <- as.numeric(stock_price_df$High)
  stock_price_df$Low <- as.numeric(stock_price_df$Low)
  stock_price_df$`Close*` <- as.numeric(stock_price_df$`Close*`)
  stock_price_df$`Adj Close**` <- as.numeric(stock_price_df$`Adj Close**`)
  # Volume is rendered with thousands separators ("1,234,567").
  stock_price_df$Volume <- as.numeric(gsub(",", "", stock_price_df$Volume))
  stock_price_df$`Stock Code` <- stock_code
  stock_price_df
}
# Pull GameStop's own price history, then chart closing price and traded
# volume over time. (theme() elements merge on "+", so the size/hjust pair
# collapses into a single element_text.)
gme_stock_df <- getHistoricalStockPriceByStockCode("GME")
ggplot(gme_stock_df, aes(x = Date, y = `Close*`, color = `Stock Code`)) +
  geom_line() +
  labs(x = "Date", y = "Price") +
  ggtitle("GameStop stock price over time.") +
  theme(plot.title = element_text(size = 13, hjust = 0.5))
ggplot(gme_stock_df, aes(x = Date, y = `Volume`, fill = `Stock Code`)) +
  geom_col() +
  labs(x = "Date", y = "Volume") +
  ggtitle("GameStop stock volume trade over time.") +
  theme(plot.title = element_text(size = 13, hjust = 0.5))
# Preview the impacted-stocks table without the Wikipedia citation column.
other_stock_data %>%
select(-`Ref.`)
## # A tibble: 22 x 4
## `Security (symbol)` `Price high[a]` `Jan 22` `% chg.`
## <chr> <dbl> <dbl> <chr>
## 1 AMC Entertainment Holdings, Inc. (AMC) 20.4 3.51 480.1%
## 2 AMC Networks Inc. (AMCX) 59.8 49.4 21.2%
## 3 American Airlines Group Inc. (AAL) 21.8 15.8 37.6%
## 4 BB Liquidating Inc. (OTC Pink: BLIAQ) 0.3 0.01 3000%
## 5 Bed Bath & Beyond Inc. (BBBY) 53.9 30.2 78.4%
## 6 BlackBerry Limited (BB) 28.8 14.0 104.9%
## 7 Build-A-Bear Workshop, Inc. (BBW) 8.4 4.52 85.8%
## 8 Eastman Kodak Company (KODK) 15.2 9.46 60.1%
## 9 Express, Inc. (EXPR) 14.0 1.79 680.4%
## 10 Fossil Group, Inc. (FOSL) 28.6 9.87 189.8%
## # … with 12 more rows
# Download price history for every resolvable ticker in the Wikipedia table,
# then facet the closing-price and volume series per security.
impacted_stock_price_df <- getImpactedStocksData(other_stock_data)
ggplot(impacted_stock_price_df, aes(x = Date, y = `Close*`, color = `Stock Code`)) +
  geom_line() +
  labs(x = "Date", y = "Stock Price") +
  facet_wrap(~`Stock Code`, ncol = 3, scales = "free") +
  ggtitle("Other Impacted Stocks Price Over Time.") +
  theme(plot.title = element_text(size = 13, hjust = 0.5))
ggplot(impacted_stock_price_df, aes(x = Date, y = `Volume`, fill = `Stock Code`)) +
  geom_col() +
  labs(x = "Date", y = "Volume") +
  facet_wrap(~`Stock Code`, ncol = 3, scales = "free") +
  ggtitle("Other Impacted Stocks Volume Trade Over Time.") +
  theme(plot.title = element_text(size = 13, hjust = 0.5))
# Live pull of one page of r/wallstreetbets threads + comments, sorted by
# relevance. The "Cannot connect..." and progress-bar lines below are
# RedditExtractoR's own console output from this call.
reddit_wallstreetbets <- get_reddit(subreddit = "wallstreetbets", page_threshold = 1, sort_by = "relevance")
## Cannot connect to the website, skipping...
## Cannot connect to the website, skipping...
## Cannot connect to the website, skipping...
## Cannot connect to the website, skipping...
##
|
| | 0%
|
|=== | 4%
|
|====== | 8%
|
|========= | 12%
|
|============ | 17%
|
|=============== | 21%
|
|================== | 25%
|
|==================== | 29%
|
|======================= | 33%
|
|========================== | 38%
|
|============================= | 42%
|
|================================ | 46%
|
|=================================== | 50%
|
|====================================== | 54%
|
|========================================= | 58%
|
|============================================ | 62%
|
|=============================================== | 67%
|
|================================================== | 71%
|
|==================================================== | 75%
|
|======================================================= | 79%
|
|========================================================== | 83%
|
|============================================================= | 88%
|
|================================================================ | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
# sample reddit wallstreetbets data.
# Show a small slice of the scraped columns to sanity-check the pull.
reddit_wallstreetbets %>% select(post_date, author, subreddit, comment) %>% head()
## post_date author subreddit
## 1 07-05-21 WhichEdge wallstreetbets
## 2 07-05-21 WhichEdge wallstreetbets
## 3 07-05-21 WhichEdge wallstreetbets
## 4 07-05-21 WhichEdge wallstreetbets
## 5 07-05-21 WhichEdge wallstreetbets
## 6 07-05-21 WhichEdge wallstreetbets
## comment
## 1 I'm not sure why anyone would want in on mortgage company stocks right now. Interest rates are slated to rise after being at lifetime lows for the last few years, while everyone who wanted to refinance has done so already, and housing inventory is at an all time low. Mortgage demand pretty much has no where to go but down at this point.
## 2 So you are saying I\031m screwed on my UWMC calls? I still have 2000 shares.
## 3 No. Hold those. UWMC is a far better company than RKT and the only people who don't realize that are people who don't/have never owned a home. \n\n\nPositions - 400 UWMC, 300 LDI
## 4 P/e of 3 even if rates rise unless they lose two thirds of their revenue this valuation is unjustified
## 5 The whole casino bet, also I think a lot thought after the huge dip it would pick up not further double down on that.\n\nI completely agree with your assessment though, it's refreshing to see some good DD here again finally.
## 6 Damn I'm just going to hold till I die then
# The 250-page live pull below is slow and rate-limited, so it was run once
# and cached; the analysis reads the CSV snapshot instead.
#reddit_wallstreetbets_250 <- get_reddit(subreddit = "wallstreetbets", page_threshold = 250, sort_by = "relevance", wait_time = 60)
#write_csv(x = reddit_wallstreetbets_250,
# file = "/Users/swaruprakshit/Documents/MSDS - Rutgers/Spring-2021/16-954-597-01-DATA-WRANGLING/Final Project/final-project-submission/data/reddit_wallstreetbets_data_250.csv",
# na = "NA")
# reading it from previously saved data.
# NOTE(review): path is relative to the setwd() at the top of the script.
reddit_wallstreetbets <- read.csv("data/reddit_wallstreetbets_data_150.csv", header = TRUE)
reddit_wallstreetbets %>% select(post_date, author, subreddit, comment) %>% head()
## post_date author subreddit
## 1 08-04-21 shdhkxjc wallstreetbets
## 2 08-04-21 shdhkxjc wallstreetbets
## 3 08-04-21 shdhkxjc wallstreetbets
## 4 08-04-21 shdhkxjc wallstreetbets
## 5 08-04-21 shdhkxjc wallstreetbets
## 6 08-04-21 shdhkxjc wallstreetbets
## comment
## 1 What bells are you gonna hear in April when you got January calls, lol
## 2 Not a yolo, this exp next year not in 2 days
## 3 Why the hell are people expecting Disney to do 30% when it\031s currently 50% higher than pre covid level? \n\nI\031ve seen some balance sheet and new openings but it\031s def not the mtk value in pricing of 260 per share
## 4 By the next year not totally impossible.
## 5 If they start streaming movies direct to the Disney channel then it\031s very possible.\n\nI think the opening up is mostly priced in so they will need to add new stuff.
## 6 Stream what, their biggest thing woke Star Wars?
#game_stop_urls <- reddit_urls(search_terms="gamestop", page_threshold = 1) # isolate some URLs
#write_csv(x = game_stop_urls,
# file = "/Users/swaruprakshit/Documents/MSDS - Rutgers/Spring-2021/16-954-597-01-DATA-WRANGLING/Final Project/final-project-submission/data/game_stop_urls.csv",
# na = "NA")
# reading data from previously saved file.
game_stop_urls <- read_csv(file = "data/game_stop_urls.csv")
# Keep only the single most-commented thread; `%$%` (magrittr exposition
# pipe) exposes its columns so URL can be piped straight into reddit_content.
game_stop_df <- game_stop_urls %>% filter(num_comments==max(game_stop_urls$num_comments)) %$% URL %>% reddit_content # get the contents of a small thread
##
|
| | 0%
|
|======================================================================| 100%
# Build the commenter interaction network for the thread (author excluded,
# edges aggregated) and render RedditExtractoR's built-in network plot.
game_stop_user_network <- game_stop_df %>% user_network(include_author=FALSE, agg=TRUE) # extract the network
game_stop_user_network$plot # explore the plot
# SECURITY(review): live Twitter API credentials are hard-coded and committed
# with this file -- treat them as compromised. Revoke/rotate them in the
# developer portal and load replacements from the environment instead, e.g.
# api_key <- Sys.getenv("TWITTER_API_KEY").
api_key <- "5HzcNSBdFTpgQmQanbNBdA2jL"
api_key_secret <- "4kLa5QZ04P56vIHedLztI1vZKjipwwC0xxGXrpWSOCRn8wU7vE"
access_token <- "1209907587073884160-vLJDQdjQl6NgrYi0gStwltMfOo5HH5"
access_token_secret <- "e3iwUh3xcZ0ZMvExou8NWUe0O2dvQSPm00ENo4vWTAkgU"
app_name <- "MSDS_FINAL_PROJECT_APP"
## authenticate via web browser
# Register an rtweet token for this session from the credentials above.
token <- create_token(
app = app_name,
consumer_key = api_key,
consumer_secret = api_key_secret,
access_token = access_token,
access_secret = access_token_secret)
# Block for `x` seconds and return the elapsed process-time delta.
# (CPU usage during a sleep should be negligible.)
delay <- function(x) {
  started <- proc.time()
  Sys.sleep(x)
  proc.time() - started
}
# Pull up to 18k recent/popular English tweets matching the GameStop /
# short-squeeze hashtags. The loop is a pagination scaffold: each pass resumes
# below the smallest status_id seen so far (`max_id`); the 15-minute
# rate-limit sleep is left commented out for single-batch runs.
#
# @return Data frame of tweets (rtweet columns: user_id, status_id, text, ...).
twitter_data_extrctor <- function() {
  prev_max_id <- "0"
  # Preallocate one slot per batch instead of growing a data frame with
  # rbind() inside the loop (O(n^2) copies). Raise the bound for more batches.
  batches <- vector("list", 1)
  for (i in seq_along(batches)) {
    # NOTE(review): "#$gme" looks like a typo for "$gme" or "#gme" -- confirm
    # before changing the query string.
    batch <- search_tweets( q = "#$gme OR #shortsqueeze OR #gamestopshortsqueeze OR #gmeshortsqueeze OR #thebigshortsqueeze OR #gamestop OR #gme",
                            n = 18000,
                            type = "mixed",
                            include_rts = TRUE,
                            geocode = NULL,
                            max_id = prev_max_id,
                            token = bearer_token(),
                            #retryonratelimit = TRUE,
                            lang = "en")
    prev_max_id <- as.character(min(batch$status_id))
    batches[[i]] <- batch
    # twitter api has rate limit to 18K records can be extracted in every 15 min.
    # delay(900)
  }
  # Single bind at the end.
  do.call(rbind, batches)
}
# twitter data frame
# Run the live extraction once and preview the key columns.
gamestop_short_squeeze_tweet_master_df <- twitter_data_extrctor()
# sample twitter data.
gamestop_short_squeeze_tweet_master_df %>%
select(user_id, status_id, created_at, screen_name, text) %>%
head()
## # A tibble: 6 x 5
## user_id status_id created_at screen_name text
## <chr> <chr> <dttm> <chr> <chr>
## 1 1584450… 13903005880… 2021-05-06 13:41:05 BetterMark… "Today's @FSCDems 3rd h…
## 2 1584450… 13899633909… 2021-05-05 15:21:11 BetterMark… "The @FSCDems holds its…
## 3 1885094… 13907229600… 2021-05-07 17:39:26 jm_corba "SEC Chair Gary Gensler…
## 4 1885094… 13903402143… 2021-05-06 16:18:33 jm_corba "Glued to the third hea…
## 5 3367334… 13883654827… 2021-05-01 05:31:40 BTCTN "Jerome Powell said tha…
## 6 2754259… 13908007770… 2021-05-07 22:48:39 edOfCHRIST "Don't Want To Say It B…
# used to save large data into csv format.
#write_as_csv(x = gamestop_short_squeeze_tweet_master_df, file_name = "/Users/swaruprakshit/Documents/MSDS - Rutgers/Spring-2021/16-954-597-01-DATA-WRANGLING/Final Project/final-project-submission/data/gamestop_short_squeeze_tweet_master_100k.csv", prepend_ids = TRUE, fileEncoding = "UTF-8")
# Reload the cached ~100k-tweet snapshot for the analysis below (note:
# created_at comes back as character, not POSIXct -- see the printed tibble).
gamestop_short_squeeze_tweet <- read_twitter_csv(file = "data/gamestop_short_squeeze_tweet_master_100k.csv", unflatten = FALSE)
gamestop_short_squeeze_tweet %>%
select(user_id, status_id, created_at, screen_name, text) %>%
head()
## # A tibble: 6 x 5
## user_id status_id created_at screen_name text
## <chr> <chr> <chr> <chr> <chr>
## 1 118639757 138993398515… 2021-05-05 … kokid951 "I Never Bought any $GME b…
## 2 158445020 138996339090… 2021-05-05 … BetterMark… "The @FSCDems holds its 3r…
## 3 3367334171 138836548277… 2021-05-01 … BTCTN "Jerome Powell said that t…
## 4 1354472301… 139027016095… 2021-05-06 … gmerockets… "This is a really good bre…
## 5 1354472301… 138958153617… 2021-05-04 … gmerockets… "$GME DFV on May 4th\n\n#M…
## 6 1354472301… 138929550337… 2021-05-03 … gmerockets… "Part III of the $GME Cong…
# Clean raw social-media text for sentiment scoring: strips @mentions, URLs,
# punctuation, digits and emoji/encoding escapes; collapses whitespace and
# lower-cases the result.
#
# @param data Character vector of raw comments/tweets.
# @return Cleaned character vector of the same length.
f_gsub_clean_data <- function (data) {
  # remove @mentions
  clean_data <- gsub('@\\w+', '', data)
  # BUG FIX: remove links while "://" is still intact; \S+ also consumes the
  # path, which the old `http\\w+` (run after punctuation removal) left to chance.
  clean_data <- gsub('http\\S+', '', clean_data)
  # remove punctuation
  clean_data <- gsub('[[:punct:]]', '', clean_data)
  # remove numbers
  clean_data <- gsub('[[:digit:]]', '', clean_data)
  # BUG FIX: collapse runs of whitespace to ONE space -- replacing with ''
  # used to glue neighbouring words together.
  clean_data <- gsub('[ \t]{2,}', ' ', clean_data)
  # trim leading/trailing whitespace
  clean_data <- gsub('^\\s+|\\s+$', '', clean_data)
  # BUG FIX: non-greedy bracket removal; the greedy '<.*>' deleted everything
  # between the FIRST '<' and the LAST '>' in a string.
  clean_data <- gsub('<[^>]*>', '', enc2native(clean_data))
  # to lowercase
  clean_data <- tolower(clean_data)
  # PORTABILITY: "utf-8-mac" only exists in macOS iconv; fall back to plain
  # UTF-8 on other platforms.
  clean_data <- tryCatch(iconv(clean_data, to = "utf-8-mac"),
                         error = function(e) iconv(clean_data, to = "UTF-8"))
  clean_data
}
# Clean both corpora once, up front; all downstream scoring uses these vectors.
reddit_wallstreetbets_comments_clean <- f_gsub_clean_data(reddit_wallstreetbets$comment)
gamestop_short_squeeze_tweet_clean <- f_gsub_clean_data(gamestop_short_squeeze_tweet$text)
The package comes with four sentiment dictionaries and provides a method for accessing the robust, but computationally expensive, sentiment extraction tool developed in the NLP group at Stanford. Use of this later method requires that you have already installed the coreNLP package (see http://nlp.stanford.edu/software/corenlp.shtml).
# Per-document syuzhet sentiment score for each corpus, plotted in document
# (narrative) order. TYPO FIX in titles: "Narrive" -> "Narrative".
reddit_wallstreetbets_comments_clean_sentiment <- syuzhet::get_sentiment(reddit_wallstreetbets_comments_clean)
gamestop_short_squeeze_tweet_clean_sentiment <- syuzhet::get_sentiment(gamestop_short_squeeze_tweet_clean)
plot(
  reddit_wallstreetbets_comments_clean_sentiment,
  type = "l",
  main = "Reddit Wallstreetbets Sentiment Trajectory Over Narrative Time",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  col = "blue"
)
plot(
  gamestop_short_squeeze_tweet_clean_sentiment,
  type = "l",
  main = "Gamestop Short Squeeze Twitter Sentiment Trajectory Over Narrative Time",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  col = "red"
)
# Re-score with the explicit "syuzhet" dictionary, then rescale to 500
# percentage bins so the two corpora share a comparable x-axis.
# TYPO FIX in titles: "Narrive" -> "Narrative".
reddit_wallstreetbets_comments_clean_vector <- syuzhet::get_sentiment(reddit_wallstreetbets_comments_clean, method = "syuzhet")
gamestop_short_squeeze_tweet_clean_vector <- syuzhet::get_sentiment(gamestop_short_squeeze_tweet_clean, method = "syuzhet")
reddit_wallstreetbets_comments_clean_percentage_sentiment <- syuzhet::get_percentage_values(reddit_wallstreetbets_comments_clean_vector, bins = 500)
gamestop_short_squeeze_tweet_clean_percentage_sentiment <- syuzhet::get_percentage_values(gamestop_short_squeeze_tweet_clean_vector, bins = 500)
plot(
  reddit_wallstreetbets_comments_clean_percentage_sentiment,
  type = "l",
  main = "Reddit Wallstreetbets Sentiment Trajectory Over Narrative Time",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  col = "blue"
)
plot(
  gamestop_short_squeeze_tweet_clean_percentage_sentiment,
  type = "l",
  main = "Gamestop Short Squeeze Twitter Sentiment Trajectory Over Narrative Time",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  col = "red"
)
# Repeat with coarser 1000-bin percentage chunks.
reddit_wallstreetbets_comments_clean_percentage_sentiment <- syuzhet::get_percentage_values(reddit_wallstreetbets_comments_clean_vector, bins = 1000)
gamestop_short_squeeze_tweet_clean_percentage_sentiment <- syuzhet::get_percentage_values(gamestop_short_squeeze_tweet_clean_vector, bins = 1000)
plot(
  reddit_wallstreetbets_comments_clean_percentage_sentiment,
  type = "l",
  main = "Reddit Wallstreetbets Sentiment Trajectory Over Narrative Time",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  col = "blue"
)
plot(
  # BUG FIX: this chart carried the Twitter title but re-plotted the REDDIT
  # vector; it now plots the Twitter percentage sentiment.
  # TYPO FIX in titles: "Narrive" -> "Narrative".
  gamestop_short_squeeze_tweet_clean_percentage_sentiment,
  type = "l",
  main = "Gamestop Short Squeeze Twitter Sentiment Trajectory Over Narrative Time",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  col = "red"
)
Explanation:- Unfortunately, the percentage-value approach does not explain the emotional valence trajectory, for the following reasons: 1. A larger chunk (i.e. a 500- or 1000-sentence bin) spans a wider range of emotion values than a 100-sentence chunk; indeed, the means of longer passages tend to converge toward 0.
2. In addition, emotional valence varies from corpus to corpus, so grouping by corpus can obscure the sentiment trajectory. The syuzhet package provides two alternatives to percentage-based comparison, using either the Fourier or the Discrete Cosine Transformation in combination with a low-pass filter.
# Fourier-transformed, low-pass-filtered sentiment shape: 100 normalized
# points per corpus. NOTE(review): the Reddit vector is recomputed here even
# though it is identical to the one built above.
# TYPO FIX in titles: "Narrive" -> "Narrative".
reddit_wallstreetbets_comments_clean_vector <- syuzhet::get_sentiment(reddit_wallstreetbets_comments_clean, method = "syuzhet")
reddit_wallstreetbets_comments_clean_vector_ft_values <- syuzhet::get_transformed_values(reddit_wallstreetbets_comments_clean_vector,
  low_pass_size = 3,
  x_reverse_len = 100,
  padding_factor = 2,
  scale_vals = TRUE,
  scale_range = FALSE
)
gamestop_short_squeeze_tweet_clean_vector_ft_values <- syuzhet::get_transformed_values(gamestop_short_squeeze_tweet_clean_vector,
  low_pass_size = 3,
  x_reverse_len = 100,
  padding_factor = 2,
  scale_vals = TRUE,
  scale_range = FALSE
)
plot(
  reddit_wallstreetbets_comments_clean_vector_ft_values,
  type = "l",
  main = "Reddit Wallstreetbets Sentiment Trajectory Over Narrative Time",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  col = "blue"
)
plot(
  gamestop_short_squeeze_tweet_clean_vector_ft_values,
  type = "l",
  main = "Gamestop Short Squeeze Twitter Sentiment Trajectory Over Narrative Time",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  col = "red"
)
# Discrete Cosine Transform smoothing (syuzhet's preferred alternative to the
# FT, with better edge behavior). TYPO FIX in titles: "Narrive" -> "Narrative".
reddit_wallstreetbets_comments_clean_vector_dct_values <- syuzhet::get_dct_transform(reddit_wallstreetbets_comments_clean_vector,
  low_pass_size = 5,
  x_reverse_len = 100,
  scale_vals = FALSE,
  scale_range = TRUE
)
gamestop_short_squeeze_tweet_clean_vector_dct_values <- syuzhet::get_dct_transform(gamestop_short_squeeze_tweet_clean_vector,
  low_pass_size = 5,
  x_reverse_len = 100,
  scale_vals = FALSE,
  scale_range = TRUE
)
plot(
  reddit_wallstreetbets_comments_clean_vector_dct_values,
  type = "l",
  main = "Reddit Wallstreetbets Sentiment Trajectory Over Narrative Time",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  col = "blue"
)
plot(
  gamestop_short_squeeze_tweet_clean_vector_dct_values,
  type = "l",
  main = "Gamestop Short Squeeze Twitter Sentiment Trajectory Over Narrative Time",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  col = "red"
)
Explanation:- Main advantage is in its better representation of edge values in the smoothed version of the sentiment vector.
# Reddit Wallstreetbets
# simple_plot() draws syuzhet's combined smoothing overview for a vector.
syuzhet::simple_plot(reddit_wallstreetbets_comments_clean_vector)
# Gamestop short squeeze twitter sentiment
syuzhet::simple_plot(gamestop_short_squeeze_tweet_clean_vector)
# NRC emotion scoring: one row per sentence; columns are the eight NRC
# emotions plus negative/positive polarity.
reddit_wallstreetbets_comments_nrc_sentiment <- reddit_wallstreetbets_comments_clean %>%
syuzhet::get_sentences() %>%
syuzhet::get_nrc_sentiment()
gamestop_short_squeeze_tweet_nrc_sentiment <- gamestop_short_squeeze_tweet_clean %>%
syuzhet::get_sentences() %>%
syuzhet::get_nrc_sentiment()
# Aggregate NRC results: columns 1:8 are the eight emotions (9:10, the
# negative/positive polarity columns, are excluded); prop.table() turns
# counts into shares.
# NOTE(review): the x-axis label says "Percentage" but the plotted values are
# proportions in [0, 1] -- confirm the intended label.
barplot(
sort(colSums(prop.table(reddit_wallstreetbets_comments_nrc_sentiment[, 1:8]))),
horiz = TRUE,
cex.names = 0.7,
las = 1,
main = "Emotions in Wallstreetbets Subreddit Group Comments",
xlab = "Percentage"
)
barplot(
sort(colSums(prop.table(gamestop_short_squeeze_tweet_nrc_sentiment[, 1:8]))),
horiz = TRUE,
cex.names = 0.7,
las = 1,
main = "Emotions in Gamestop Short Squeeze Twitter Comments",
xlab = "Percentage"
)
Explanation:- Trust and anticipation are the top two sentiment categories in both the Reddit Wallstreetbets comments and the GameStop short squeeze tweets. In the Reddit Wallstreetbets group comments, more than 20% of emotion terms are trust-related, while in the GameStop short squeeze tweets more than 20% relate to trust and to anticipation.
# Wrap each corpus in a one-element NAMED list before sentence-splitting so
# that the downstream lapply()/Map() pipelines keep the corpus label.
reddit_wallstreetbets_comments_sentiment_text <-
list(reddit_wallstreetbets_comment = reddit_wallstreetbets_comments_clean) %>%
lapply(syuzhet::get_sentences)
gamestop_short_squeeze_tweet_sentiment_text <-
list(gamestop_short_squeeze_tweet = gamestop_short_squeeze_tweet_clean) %>%
lapply(syuzhet::get_sentences)
# Score a sentence vector with each of syuzhet's four dictionaries and
# return the results as a named list (bing, afinn, nrc, syuzhet).
syuzhet_multiple_sentiment <- function(sentences) {
  methods <- c("bing", "afinn", "nrc", "syuzhet")
  scores <- lapply(methods,
                   function(m) syuzhet::get_sentiment(sentences, method = m))
  setNames(scores, methods)
}
# Score each (named) corpus under all four dictionaries.
reddit_wallstreetbets_comments_sentiment <- reddit_wallstreetbets_comments_sentiment_text %>%
lapply(syuzhet_multiple_sentiment)
gamestop_short_squeeze_tweet_sentiment <- gamestop_short_squeeze_tweet_sentiment_text %>%
lapply(syuzhet_multiple_sentiment)
# Summarise a sentiment vector -- or a (possibly named) list of vectors --
# with its total, mean, and summary() statistics.
sum_up_sentiment <- function(x) {
  describe <- function(scores) {
    list(sum = sum(scores),
         mean = mean(scores),
         summary = summary(scores))
  }
  if (is.list(x)) {
    lapply(x, describe)
  } else {
    describe(x)
  }
}
# Summarise each dictionary's scores; list.unzip (rlist) regroups the result
# by dictionary name for side-by-side comparison.
reddit_wallstreetbets_comments_sentiment %>%
lapply(sum_up_sentiment) %>%
list.unzip()
## $bing
## reddit_wallstreetbets_comment
## sum 3749
## mean 0.04649634
## summary Numeric,6
##
## $afinn
## reddit_wallstreetbets_comment
## sum 20325
## mean 0.2520774
## summary Numeric,6
##
## $nrc
## reddit_wallstreetbets_comment
## sum 14500
## mean 0.1798338
## summary Numeric,6
##
## $syuzhet
## reddit_wallstreetbets_comment
## sum 13340.25
## mean 0.1654502
## summary Numeric,6
# Same per-dictionary summary for the Twitter corpus.
gamestop_short_squeeze_tweet_sentiment %>%
lapply(sum_up_sentiment) %>%
list.unzip()
## $bing
## gamestop_short_squeeze_tweet
## sum 6971
## mean 0.08911929
## summary Numeric,6
##
## $afinn
## gamestop_short_squeeze_tweet
## sum 36099
## mean 0.4615001
## summary Numeric,6
##
## $nrc
## gamestop_short_squeeze_tweet
## sum 14150
## mean 0.1808977
## summary Numeric,6
##
## $syuzhet
## gamestop_short_squeeze_tweet
## sum 18307.35
## mean 0.2340465
## summary Numeric,6
# Line-plot a sentiment trajectory, adding a dashed green reference line at
# the neutral (zero) level.
plot_sentiment <- function(x, title) {
  plot(
    x,
    type = "l",
    main = title,
    xlab = "Narrative time",
    ylab = "Emotion valance"
    # ylim = c(-1.5, 3.25) # roughly the min and the max
  )
  abline(h = 0, col = 3, lty = 2) # neutral sentiment
}
# Flatten the nested corpus->dictionary list, rescale each score vector to
# percentage bins, and plot one trajectory per dictionary.
reddit_wallstreetbets_comments_sentiment %>%
list.flatten() %>%
lapply(syuzhet::get_percentage_values) %>%
Map(plot_sentiment, ., names(.))
## $reddit_wallstreetbets_comment.bing
## NULL
##
## $reddit_wallstreetbets_comment.afinn
## NULL
##
## $reddit_wallstreetbets_comment.nrc
## NULL
##
## $reddit_wallstreetbets_comment.syuzhet
## NULL
# Same per-dictionary trajectory plots for the Twitter corpus.
gamestop_short_squeeze_tweet_sentiment %>%
list.flatten() %>%
lapply(syuzhet::get_percentage_values) %>%
Map(plot_sentiment, ., names(.))
## $gamestop_short_squeeze_tweet.bing
## NULL
##
## $gamestop_short_squeeze_tweet.afinn
## NULL
##
## $gamestop_short_squeeze_tweet.nrc
## NULL
##
## $gamestop_short_squeeze_tweet.syuzhet
## NULL
# Append a `position` column (1..nrow) recording each sentence's narrative
# order, used as the x-axis in the smoothed emotion plots.
#
# @param df Data frame of per-sentence scores.
# @return `df` with an extra integer `position` column.
bind_pos <- function(df) {
  # BUG FIX: seq_len() handles the 0-row edge case; `1:nrow(df)` would yield
  # c(1, 0) and make cbind() fail on an empty frame.
  pos <- data.frame(position = seq_len(nrow(df)))
  cbind(df, pos)
}
# Smoothed (loess) prevalence of each NRC emotion across narrative position.
plot_nrc <- function(df, title) {
  ggplot(df, aes(x = position, y = value, color = emotion)) +
    geom_smooth(size = 2, se = FALSE) +
    labs(x = "Narrative position", y = "Prevalence", title = title) +
    theme_classic()
}
# NRC-score each corpus sentence-by-sentence, attach narrative position, melt
# the eight emotion columns to long form (polarity columns excluded), and
# draw one smoothed-trajectory plot per corpus.
# NOTE(review): gather() is superseded by tidyr::pivot_longer() -- consider
# migrating when next touching this pipeline.
reddit_wallstreetbets_comments_sentiment_text %>%
lapply(syuzhet::get_nrc_sentiment) %>%
lapply(bind_pos) %>%
lapply(gather, emotion, value, -position, -negative, -positive) %>%
Map(plot_nrc, ., names(.))
## $reddit_wallstreetbets_comment
gamestop_short_squeeze_tweet_sentiment_text %>%
lapply(syuzhet::get_nrc_sentiment) %>%
lapply(bind_pos) %>%
lapply(gather, emotion, value, -position, -negative, -positive) %>%
Map(plot_nrc, ., names(.))
## $gamestop_short_squeeze_tweet
Explanation:- After processing more than 80K Reddit Wallstreetbets comments and GameStop short squeeze tweets, we found that trust and anticipation are the emotions that stand out most. The chart above depicts how the various sentiment categories change across the ~80K-document narrative; trust and anticipation are the top emotions there as well.
Matthew Jockers created the syuzhet package that utilizes dictionary lookups for the Bing, NRC, and Afinn methods as well as a custom dictionary. He also utilizes a wrapper for the Stanford coreNLP which uses much more sophisticated analysis. Jocker’s dictionary methods are fast but are more prone to error in the case of valence shifters.
sentimentr attempts to take into account valence shifters (i.e., negators, amplifiers (intensifiers), de-amplifiers (downtoners), and adversative conjunctions) while maintaining speed. Simply put, sentimentr is an augmented dictionary lookup. The next questions address why it matters.
Well valence shifters affect the polarized words. In the case of negators and adversative conjunctions the entire sentiment of the clause may be reversed or overruled. So if valence shifters occur fairly frequently a simple dictionary lookup may not be modeling the sentiment appropriately. You may be wondering how frequently these valence shifters co-occur with polarized words, potentially changing, or even reversing and overruling the clause’s sentiment. The table below shows the rate of sentence level co-occurrence of valence shifters with polarized words across a few types of texts.
# sentimentr demo: four sentences showing how valence shifters (negator,
# amplifier, de-amplifier, adversative conjunction) move the polarity score.
c("I do not like it.", "I really like it.", "I hardly like it.", "I like it but it’s not worth it.") %>%
get_sentences() %>%
sentiment()
## element_id sentence_id word_count sentiment
## 1: 1 1 5 -0.2236068
## 2: 2 1 4 0.4500000
## 3: 3 1 4 0.0500000
## 4: 4 1 9 -0.5623333
# Sentence-level co-occurrence rate of valence shifters (negator, amplifier,
# de-amplifier, adversative conjunction) with polarized words, per corpus,
# formatted as whole percentages via numform::f_prop2percent.
comments_attributes_rate <- list(
  sentiment_attributes(reddit_wallstreetbets_comments_clean),
  sentiment_attributes(gamestop_short_squeeze_tweet_clean)
) %>%
  lapply(function(y){
    x <- y[['Polarized_Cooccurrences']]
    # BUG FIX: "stringsAsFacto1rs" (stray '1') was not a recognised
    # data.frame() argument, so it leaked into the output as a literal
    # column (visible in the printed table below).
    data.frame(setNames(as.list(f_prop2percent(x[[2]], 0)), gsub('-', '', x[[1]])),
               stringsAsFactors = FALSE, check.names = FALSE)
  }) %>%
  setNames(c('Reddit Wallstreetbets Comments', 'Gamestop Short Squeeze Twitter Comments')) %>%
  tidy_list('text')
comments_attributes_rate
## text negator amplifier deamplifier
## 1: Reddit Wallstreetbets Comments 26% 17% 5%
## 2: Gamestop Short Squeeze Twitter Comments 28% 15% 2%
## adversative stringsAsFacto1rs
## 1: 13% FALSE
## 2: 8% FALSE
# Add cleaned-text columns to both source frames, then plot sentimentr's
# profanity rate over the sentence sequence for each corpus.
reddit_wallstreetbets_clean_df <- reddit_wallstreetbets %>%
mutate(clean_comments = f_gsub_clean_data(comment))
reddit_wallstreetbets_clean_df$clean_comments %>%
sentimentr::get_sentences() %>%
sentimentr::profanity() %>%
plot()
gamestop_short_squeeze_tweet_clean_df <- gamestop_short_squeeze_tweet %>%
mutate(clean_tweet = f_gsub_clean_data(text))
gamestop_short_squeeze_tweet_clean_df$clean_tweet %>%
sentimentr::get_sentences() %>%
sentimentr::profanity() %>%
plot()
# Extract the matched profanity terms; attributes() exposes the `counts`
# table (term frequencies) that extract_profanity_terms() attaches.
reddit_wallstreetbets_profanity_df <- reddit_wallstreetbets_clean_df$clean_comments %>%
sentimentr::get_sentences() %>%
sentimentr::extract_profanity_terms() %>%
attributes()
reddit_wallstreetbets_profanity_df$counts %>%
head()
## words profanity n
## 1: shit 1 2063
## 2: fuck 1 2031
## 3: fucking 1 1634
## 4: retard 1 681
## 5: ass 1 439
## 6: hell 1 418
# Same profanity-term frequency table for the Twitter corpus.
gamestop_short_squeeze_tweet_profanity_df <- gamestop_short_squeeze_tweet_clean_df$clean_tweet %>%
sentimentr::get_sentences() %>%
sentimentr::extract_profanity_terms() %>%
attributes()
gamestop_short_squeeze_tweet_profanity_df$counts %>%
head()
## words profanity n
## 1: fucking 1 1004
## 2: shit 1 755
## 3: fuck 1 582
## 4: ass 1 392
## 5: hell 1 325
## 6: tits 1 72
# sentimentr emotion trajectories for the Reddit corpus; emotions that never
# occur are dropped from the panel.
reddit_wallstreetbets_clean_df$clean_comments %>%
sentimentr::get_sentences() %>%
sentimentr::emotion() %>%
plot(drop.unused.emotions = TRUE)
Explanation:- It happened to be trust and anticipation are top most emotion valance as per sentimentr package as well.
# sentimentr emotion trajectories for the Twitter corpus.
gamestop_short_squeeze_tweet_clean_df$clean_tweet %>%
sentimentr::get_sentences() %>%
sentimentr::emotion() %>%
plot(drop.unused.emotions = TRUE)
Explanation:- It happened to be trust and anticipation are top most emotion valance as per sentimentr package as well.
# Emotion-bearing terms for the Reddit corpus; attributes() exposes the
# `counts` table attached by extract_emotion_terms().
reddit_wallstreetbets_emotion_df <- reddit_wallstreetbets_clean_df$clean_comments %>%
sentimentr::get_sentences() %>%
sentimentr::extract_emotion_terms() %>%
attributes()
reddit_wallstreetbets_emotion_df$counts %>%
distinct(words) %>%
head()
## words
## 1: money
## 2: good
## 3: time
## 4: calls
## 5: shit
## 6: don
# Same emotion-term extraction for the Twitter corpus.
gamestop_short_squeeze_tweet_emotion_df <- gamestop_short_squeeze_tweet_clean_df$clean_tweet %>%
sentimentr::get_sentences() %>%
sentimentr::extract_emotion_terms() %>%
attributes()
gamestop_short_squeeze_tweet_emotion_df$counts %>%
distinct(words) %>%
head()
## words
## 1: time
## 2: good
## 3: money
## 4: ill
## 5: love
## 6: manipulation
# Export per-sentence sentiment as color-highlighted HTML for manual review.
# NOTE(review): absolute, user-specific output paths -- not portable; prefer
# paths relative to the project root.
reddit_wallstreetbets_clean_df$clean_comments %>%
sentimentr::get_sentences() %>%
sentimentr::sentiment_by() %>%
sentimentr::highlight(file = "/Users/swaruprakshit/Documents/MSDS - Rutgers/Spring-2021/16-954-597-01-DATA-WRANGLING/Final Project/final-project-submission/reddit_wallstreetbets_comment_sentimentr_package_highlight.html")
gamestop_short_squeeze_tweet_clean_df$clean_tweet %>%
sentimentr::get_sentences() %>%
sentimentr::sentiment_by() %>%
sentimentr::highlight(file = "/Users/swaruprakshit/Documents/MSDS - Rutgers/Spring-2021/16-954-597-01-DATA-WRANGLING/Final Project/final-project-submission/gamestop_short_squeeze_twitter_tweet_sentimentr_package_highlight.html")
Explanation:- The highlight files are written to the paths given in the calls above (under final-project-submission/). Reddit Wallstreetbets sentiment is highlighted in reddit_wallstreetbets_comment_sentimentr_package_highlight.html; Twitter's tweet sentiment is highlighted in gamestop_short_squeeze_twitter_tweet_sentimentr_package_highlight.html.